library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between() masks data.table::between()
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(dplyr)
library(tidyr)
library(ggplot2)
library(plotly)
##
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:ggplot2':
##
## last_plot
## L'objet suivant est masqué depuis 'package:stats':
##
## filter
## L'objet suivant est masqué depuis 'package:graphics':
##
## layout
library(knitr)
library(DT)
library(pals)
library(splines)
library(MASS)
##
## Attachement du package : 'MASS'
## L'objet suivant est masqué depuis 'package:plotly':
##
## select
## L'objet suivant est masqué depuis 'package:dplyr':
##
## select
df_claims = read.csv("14OURE-PG_2017_CLAIMS_YEAR0.csv")
df_year0 = read.csv("14OURE-PG_2017_YEAR0.csv")
df_year1 = read.csv("14OURE-PG_2017_YEAR1.csv")
df_regions = read.csv("departements_francais.csv", sep = ";") %>% dplyr::select(NUMERO, REGION, "DENSITE..habitants.km2.")
df_year0 = df_year0 %>% mutate(NUMERO = substr(df_year0$pol_insee_code,1,2))
df_year0 = left_join(df_year0, df_regions, by = "NUMERO" )
df_year0$DENSITE..habitants.km2. <- as.numeric(gsub(",", ".", df_year0$DENSITE..habitants.km2.)) #convert density to numeric
names(df_year0)[names(df_year0)=="DENSITE..habitants.km2."] <- 'density'
df_claims_0 = df_claims[(df_claims$claim_amount >= 0),]
# create final merged df
df = left_join(df_year0, df_claims_0, by = c("id_policy")) %>%
mutate(claim_nb = ifelse(is.na(claim_nb), 0, claim_nb), claim_amount=ifelse(is.na(claim_amount), 0, claim_amount))
df <- df[df$claim_amount > 0,] #keep individuals claims for modelisation
df$drv_age1G <- cut(df$drv_age1, c(17, 2:8*10, max(df$drv_age1)))
df$drv_age2G <- cut(df$drv_age1, c(17, 25, 45,max(df$drv_age1)))
df$vh_ageG <- cut(df$vh_age, c(0, 10, 25, 30, 100))#no vehicle with age 0, min value is 1 in the dataset
df$vh_valueG = cut(df$vh_value, c(min(df$vh_value), 8000, 15000, 22000, max(df$vh_value)), include.lowest = TRUE)
df$vh_dinG = cut(df$vh_din, c(min(df$vh_din), 50, 220, max(df$vh_din)), include.lowest = TRUE)
df$vh_makeG = ifelse(df$vh_make %in% c("RENAULT", "NISSAN", "CITROEN"), "A",
ifelse(df$vh_make %in% c("VOLKSWAGEN", "AUDI", "SKODA", "SEAT"), "B",
ifelse(df$vh_make %in% c("OPEL", "GENERAL MOTORS", "FORD"), "C",
ifelse(df$vh_make %in% c("FIAT"), "D",
ifelse(df$vh_make %in% c("MERCEDES BENZ", "BMW", "CITROEN"), "E", "G")))))
df$densityG<- cut(df$density,c(0,40,200,500,4500,Inf),
include.lowest = TRUE)
df$pol_bonusG = cut(df$pol_bonus, c(0.5, 0.9, 1.2, 1.4, max(df$pol_bonus)), include.lowest = TRUE)
df$pol_durationG = cut(df$pol_duration, c(min(df$pol_duration),10, 20,25,30, max(df$pol_duration)), include.lowest = TRUE)
df$pol_coverageG = ifelse(df$pol_coverage %in% c("Median1", "Median2"), "Median", df$pol_coverage)
var <-c("drv_age1","drv_age1G","drv_age2G" ,"vh_ageG", "vh_age" , "vh_valueG", "vh_value" ,"vh_din", "vh_dinG","vh_type", "vh_fuel", "vh_makeG", "densityG","density","pol_bonus", "pol_bonusG" ,"pol_durationG", "pol_coverageG", "claim_amount")
#keep features we worked on andh the original for some continous ones
df <- df[var]
formula = log(claim_amount)~drv_age2G + vh_ageG +vh_valueG + vh_dinG +vh_makeG + densityG + pol_bonusG + pol_durationG +pol_coverageG + vh_type + vh_fuel
reg.logn <- lm(formula ,data=df)#[df$claim_amount<15000,]
summary(reg.logn)
##
## Call:
## lm(formula = formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2603 -1.0583 0.0478 1.1342 6.7410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.470823 0.221437 24.706 < 2e-16 ***
## drv_age2G(25,45] -0.233306 0.124101 -1.880 0.06013 .
## drv_age2G(45,103] -0.083907 0.123786 -0.678 0.49789
## vh_ageG(10,25] -0.138167 0.033068 -4.178 2.96e-05 ***
## vh_ageG(25,30] 0.525206 0.220643 2.380 0.01731 *
## vh_ageG(30,100] -0.043645 0.309494 -0.141 0.88786
## vh_valueG(8e+03,1.5e+04] -0.074437 0.176655 -0.421 0.67349
## vh_valueG(1.5e+04,2.2e+04] 0.061569 0.178986 0.344 0.73086
## vh_valueG(2.2e+04,1.32e+05] 0.099337 0.180214 0.551 0.58150
## vh_dinG(50,220] 0.725777 0.161174 4.503 6.76e-06 ***
## vh_dinG(220,507] 1.240460 0.207559 5.976 2.34e-09 ***
## vh_makeGB -0.007619 0.051079 -0.149 0.88143
## vh_makeGC 0.068735 0.053510 1.285 0.19898
## vh_makeGD 0.125928 0.089159 1.412 0.15786
## vh_makeGE 0.155744 0.067493 2.308 0.02104 *
## vh_makeGG 0.082177 0.032518 2.527 0.01151 *
## densityG(40,200] 0.051032 0.064408 0.792 0.42819
## densityG(200,500] 0.129857 0.069013 1.882 0.05991 .
## densityG(500,4.5e+03] -0.008089 0.078408 -0.103 0.91783
## densityG(4.5e+03,Inf] 0.130164 0.077793 1.673 0.09431 .
## pol_bonusG(0.9,1.2] 0.208619 0.107375 1.943 0.05205 .
## pol_bonusG(1.2,1.4] 0.821494 0.600486 1.368 0.17132
## pol_bonusG(1.4,1.56] 0.833189 0.791647 1.052 0.29260
## pol_durationG(10,20] 0.070193 0.033221 2.113 0.03463 *
## pol_durationG(20,25] -0.079843 0.052963 -1.508 0.13170
## pol_durationG(25,30] 0.120919 0.052681 2.295 0.02173 *
## pol_durationG(30,39] -0.217601 0.146913 -1.481 0.13859
## pol_coverageGMedian -0.780744 0.033137 -23.561 < 2e-16 ***
## pol_coverageGMini -0.768901 0.064257 -11.966 < 2e-16 ***
## vh_typeTourism -0.165298 0.051268 -3.224 0.00127 **
## vh_fuelGasoline 0.019220 0.033451 0.575 0.56559
## vh_fuelHybrid 0.191175 0.424205 0.451 0.65224
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.581 on 12992 degrees of freedom
## Multiple R-squared: 0.06464, Adjusted R-squared: 0.06241
## F-statistic: 28.96 on 31 and 12992 DF, p-value: < 2.2e-16
formula <- claim_amount ~drv_age2G + vh_ageG +vh_valueG + vh_dinG +vh_makeG + densityG + pol_bonusG + pol_durationG +pol_coverageG + vh_type + vh_fuel
reg.gamma <- glm(formula,family=Gamma(link="log"),data=df[df$claim_amount<15000,])#focus on subset
summary(reg.gamma)
##
## Call:
## glm(formula = formula, family = Gamma(link = "log"), data = df[df$claim_amount <
## 15000, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7930 -1.5489 -0.8305 0.1299 6.1798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.632420 0.233736 28.376 < 2e-16 ***
## drv_age2G(25,45] -0.185768 0.130835 -1.420 0.15567
## drv_age2G(45,103] -0.076800 0.130499 -0.589 0.55620
## vh_ageG(10,25] -0.208069 0.034931 -5.957 2.64e-09 ***
## vh_ageG(25,30] 0.324265 0.234872 1.381 0.16742
## vh_ageG(30,100] -0.076819 0.326417 -0.235 0.81395
## vh_valueG(8e+03,1.5e+04] 0.063366 0.187388 0.338 0.73525
## vh_valueG(1.5e+04,2.2e+04] 0.131967 0.189850 0.695 0.48700
## vh_valueG(2.2e+04,1.32e+05] 0.141246 0.191145 0.739 0.45995
## vh_dinG(50,220] 0.394551 0.170250 2.317 0.02049 *
## vh_dinG(220,507] 0.587790 0.219085 2.683 0.00731 **
## vh_makeGB -0.022290 0.053888 -0.414 0.67915
## vh_makeGC 0.078971 0.056474 1.398 0.16203
## vh_makeGD 0.124002 0.094004 1.319 0.18715
## vh_makeGE 0.157463 0.071254 2.210 0.02713 *
## vh_makeGG 0.041687 0.034328 1.214 0.22463
## densityG(40,200] 0.037201 0.067908 0.548 0.58383
## densityG(200,500] 0.084991 0.072770 1.168 0.24286
## densityG(500,4.5e+03] 0.029857 0.082703 0.361 0.71810
## densityG(4.5e+03,Inf] 0.092881 0.082039 1.132 0.25759
## pol_bonusG(0.9,1.2] 0.199666 0.113200 1.764 0.07778 .
## pol_bonusG(1.2,1.4] 0.454897 0.633061 0.719 0.47242
## pol_bonusG(1.4,1.56] 0.481863 0.834567 0.577 0.56369
## pol_durationG(10,20] 0.008678 0.035061 0.248 0.80452
## pol_durationG(20,25] -0.039237 0.055956 -0.701 0.48318
## pol_durationG(25,30] 0.125322 0.055630 2.253 0.02429 *
## pol_durationG(30,39] -0.425333 0.158234 -2.688 0.00720 **
## pol_coverageGMedian -0.462150 0.034975 -13.214 < 2e-16 ***
## pol_coverageGMini -0.490353 0.067748 -7.238 4.81e-13 ***
## vh_typeTourism -0.109370 0.054166 -2.019 0.04349 *
## vh_fuelGasoline 0.044157 0.035325 1.250 0.21131
## vh_fuelHybrid -0.144775 0.447207 -0.324 0.74615
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 2.776586)
##
## Null deviance: 27685 on 12989 degrees of freedom
## Residual deviance: 26758 on 12958 degrees of freedom
## AIC: 201470
##
## Number of Fisher Scoring iterations: 9
reg.invgaus <- glm(formula ,family=inverse.gaussian(link="1/mu^2"),data=df[df$claim_amount<15000,])
summary(reg.invgaus)
##
## Call:
## glm(formula = formula, family = inverse.gaussian(link = "1/mu^2"),
## data = df[df$claim_amount < 15000, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.76730 -0.08070 -0.03203 0.00425 0.17955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.780e-06 8.381e-07 3.317 0.000913 ***
## drv_age2G(25,45] 2.200e-07 2.112e-07 1.042 0.297530
## drv_age2G(45,103] 5.698e-09 2.059e-07 0.028 0.977926
## vh_ageG(10,25] 5.028e-07 9.085e-08 5.534 3.20e-08 ***
## vh_ageG(25,30] -2.876e-07 1.702e-07 -1.690 0.091051 .
## vh_ageG(30,100] -9.342e-08 7.689e-07 -0.121 0.903301
## vh_valueG(8e+03,1.5e+04] 1.635e-08 5.293e-07 0.031 0.975362
## vh_valueG(1.5e+04,2.2e+04] -1.265e-07 5.310e-07 -0.238 0.811790
## vh_valueG(2.2e+04,1.32e+05] -1.421e-07 5.324e-07 -0.267 0.789563
## vh_dinG(50,220] -2.005e-06 7.906e-07 -2.536 0.011223 *
## vh_dinG(220,507] -2.182e-06 8.108e-07 -2.691 0.007123 **
## vh_makeGB 7.423e-08 1.235e-07 0.601 0.547831
## vh_makeGC -1.122e-07 1.111e-07 -1.010 0.312541
## vh_makeGD -1.804e-07 1.758e-07 -1.026 0.304706
## vh_makeGE -1.981e-07 1.133e-07 -1.749 0.080300 .
## vh_makeGG -7.261e-08 7.167e-08 -1.013 0.311033
## densityG(40,200] -5.709e-08 1.540e-07 -0.371 0.710888
## densityG(200,500] -1.337e-07 1.615e-07 -0.828 0.407585
## densityG(500,4.5e+03] -6.665e-08 1.811e-07 -0.368 0.712835
## densityG(4.5e+03,Inf] -1.279e-07 1.741e-07 -0.735 0.462630
## pol_bonusG(0.9,1.2] -2.992e-07 1.916e-07 -1.562 0.118422
## pol_bonusG(1.2,1.4] -4.141e-07 6.611e-07 -0.626 0.531048
## pol_bonusG(1.4,1.56] -6.911e-07 9.208e-07 -0.751 0.452934
## pol_durationG(10,20] -1.023e-08 7.166e-08 -0.143 0.886522
## pol_durationG(20,25] 6.808e-08 1.204e-07 0.565 0.571819
## pol_durationG(25,30] -1.879e-07 9.129e-08 -2.058 0.039581 *
## pol_durationG(30,39] 1.585e-06 7.308e-07 2.169 0.030135 *
## pol_coverageGMedian 1.355e-06 1.248e-07 10.856 < 2e-16 ***
## pol_coverageGMini 1.382e-06 2.610e-07 5.294 1.22e-07 ***
## vh_typeTourism 1.817e-07 9.471e-08 1.919 0.055042 .
## vh_fuelGasoline -6.639e-08 7.330e-08 -0.906 0.365140
## vh_fuelHybrid 1.771e-07 9.382e-07 0.189 0.850268
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 0.003277493)
##
## Null deviance: 165.75 on 12989 degrees of freedom
## Residual deviance: 164.70 on 12958 degrees of freedom
## AIC: 206785
##
## Number of Fisher Scoring iterations: 10
mean(df[df$claim_amount<15000, 'claim_amount']) #empirical mean
## [1] 972.7987
sigma <- summary(reg.logn)$sigma
mean(exp(predict(reg.logn))*exp(sigma^2/2)) #mean for log normal
## [1] 1278.154
mean(predict(reg.gamma,type="response")) #mean gamma
## [1] 971.0181
mean(predict(reg.invgaus,type="response"))# mean inverse gaussian
## [1] 972.8024
ordered_claim <- df[order(-df$claim_amount),c("claim_amount","drv_age1","vh_age","vh_value","vh_fuel","density")]
ordered_claim$aggamount <-cumsum(ordered_claim$claim_amount)/sum(ordered_claim$claim_amount)*100
ordered_claim$aggnb <- (1:length(df$claim_amount))/length(df$claim_amount) * 100
#ordered_claim[ordered_claim$aggnb >= 2.5,c('claim_amount','aggamount', 'aggnb')] # 57 sinistres comptent pour 10% des montants totaux !
print(tail(ordered_claim[ordered_claim$claim_amount >= 6000,c('claim_amount','aggamount', 'aggnb')]))
## claim_amount aggamount aggnb
## 86871 6050.00 25.33399 2.426290
## 62114 6047.08 25.37817 2.433968
## 46670 6039.41 25.42230 2.441646
## 33169 6034.74 25.46639 2.449324
## 54947 6024.31 25.51041 2.457002
## 28101 6002.17 25.55427 2.464681
Nous avons constaté par une étude descriptive que seulement 5% des sinsitres représentaient près de 40% de la charge sinistre totale. Nous choisissons un niveau de séparation de 6000€. Les sinistres supérieurs à ce montant représentent alors 2.46% des sinistres de la base ainsi que 25.5% de la charge sinistre totale.
u <- 6000 #valeur de séparation
su<- sum(ifelse(df$claim_amount - u > 0, df$claim_amount - u, 0))/length(df$claim_amount) #charge surcrête moyenne
df$surcrete <- ifelse (df$claim_amount<u, df$claim_amount, u) + su
summary(df$surcrete)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 121.0 236.6 485.2 1050.8 1238.2 6120.7
Nous pouvons maintenant proposer une modélisation pour les nouveaux montants encrếtés.
formula <- surcrete ~drv_age2G + vh_ageG +vh_valueG + vh_dinG +vh_makeG + densityG + pol_bonusG + pol_durationG +pol_coverageG + vh_type + vh_fuel
reg.logn <- lm(log(surcrete)~drv_age1 + vh_ageG +vh_value + pol_durationG ,data=df)
summary(reg.logn)
##
## Call:
## lm(formula = log(surcrete) ~ drv_age1 + vh_ageG + vh_value +
## pol_durationG, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0901 -0.8728 -0.1670 0.7576 2.8294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.961e+00 4.184e-02 142.481 < 2e-16 ***
## drv_age1 3.502e-03 6.264e-04 5.591 2.30e-08 ***
## vh_ageG(10,25] -1.104e-01 2.121e-02 -5.207 1.95e-07 ***
## vh_ageG(25,30] 3.490e-01 1.445e-01 2.415 0.015754 *
## vh_ageG(30,100] -9.108e-02 1.891e-01 -0.482 0.630161
## vh_value 1.072e-05 1.059e-06 10.124 < 2e-16 ***
## pol_durationG(10,20] 6.368e-02 2.191e-02 2.907 0.003656 **
## pol_durationG(20,25] -2.462e-02 3.499e-02 -0.704 0.481614
## pol_durationG(25,30] 1.299e-01 3.475e-02 3.737 0.000187 ***
## pol_durationG(30,39] -1.819e-01 9.730e-02 -1.870 0.061562 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.048 on 13014 degrees of freedom
## Multiple R-squared: 0.01672, Adjusted R-squared: 0.01604
## F-statistic: 24.59 on 9 and 13014 DF, p-value: < 2.2e-16
reg.gamma <- glm(surcrete~drv_age1 + vh_ageG +vh_value + pol_durationG ,family=Gamma(link="log"),data=df)
summary(reg.gamma)
##
## Call:
## glm(formula = surcrete ~ drv_age1 + vh_ageG + vh_value + pol_durationG,
## family = Gamma(link = "log"), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8244 -1.1831 -0.6736 0.1699 3.1478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.672e+00 5.109e-02 130.601 < 2e-16 ***
## drv_age1 2.441e-03 7.649e-04 3.192 0.001416 **
## vh_ageG(10,25] -1.535e-01 2.590e-02 -5.927 3.15e-09 ***
## vh_ageG(25,30] 3.837e-01 1.764e-01 2.174 0.029687 *
## vh_ageG(30,100] -6.913e-02 2.310e-01 -0.299 0.764711
## vh_value 8.183e-06 1.293e-06 6.327 2.58e-10 ***
## pol_durationG(10,20] 4.566e-02 2.675e-02 1.707 0.087858 .
## pol_durationG(20,25] -2.740e-03 4.272e-02 -0.064 0.948860
## pol_durationG(25,30] 1.608e-01 4.243e-02 3.791 0.000151 ***
## pol_durationG(30,39] -4.279e-02 1.188e-01 -0.360 0.718738
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 1.637889)
##
## Null deviance: 15664 on 13023 degrees of freedom
## Residual deviance: 15453 on 13014 degrees of freedom
## AIC: 207252
##
## Number of Fisher Scoring iterations: 6
reg.invgaus <- glm(surcrete~drv_age1 + vh_ageG +vh_value + pol_durationG ,family=inverse.gaussian(link="1/mu^2"),data=df)
summary(reg.invgaus)
##
## Call:
## glm(formula = surcrete ~ drv_age1 + vh_ageG + vh_value + pol_durationG,
## family = inverse.gaussian(link = "1/mu^2"), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.082251 -0.049925 -0.024162 0.005089 0.084602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.313e-06 8.245e-08 15.923 < 2e-16 ***
## drv_age1 -4.671e-09 1.373e-09 -3.402 0.00067 ***
## vh_ageG(10,25] 3.236e-07 5.422e-08 5.969 2.44e-09 ***
## vh_ageG(25,30] -4.566e-07 1.661e-07 -2.748 0.00600 **
## vh_ageG(30,100] 2.312e-07 5.160e-07 0.448 0.65406
## vh_value -7.987e-12 2.944e-13 -27.132 < 2e-16 ***
## pol_durationG(10,20] -8.678e-08 4.736e-08 -1.832 0.06693 .
## pol_durationG(20,25] -1.111e-09 7.924e-08 -0.014 0.98881
## pol_durationG(25,30] -2.620e-07 6.349e-08 -4.127 3.69e-05 ***
## pol_durationG(30,39] 1.604e-07 2.526e-07 0.635 0.52552
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 0.001594935)
##
## Null deviance: 22.871 on 13023 degrees of freedom
## Residual deviance: 22.682 on 13014 degrees of freedom
## AIC: 202582
##
## Number of Fisher Scoring iterations: 11
Nous optons pour un modèle par écrêtement par souci de simplicité par la suite. Nous allons dans cette partie procéder à une sélection de variables par Stepwise AIC et comparer les modèles obtenue en sortie.
reg.gamma.full <- glm(surcrete~.,family=Gamma(link="log"),data=subset(df, select= -c(claim_amount))) #do not keep claim amount !
gamma_backward <- step(reg.gamma.full, direction = 'backward')
summary(gamma_backward)
reg.gamma.zero <- glm(surcrete~1,family=Gamma(link="log"),data=na.omit(subset(df, select= -c(claim_amount)))) #do not keep claim amount !
gamma_forward <- step(reg.gamma.zero, scope = list(lower=formula(reg.gamma.zero),upper=formula(reg.gamma.full)), direction = 'forward')
summary(gamma_forward)
gamma_stepwise <- step(reg.gamma.full, direction = 'both')
summary(gamma_stepwise)
Nous comparons les résultats de ces 3 modèles : en particulier, nous remarquerons que les variables sélectionnées sont régulièrement les mêmes, peu importe la méthode. Finalement, nous retiendrons la méthode stepwise qui fournit des résultats similaires à la méthode backward mais semble plus précautionneuse. Nous devons tout de même relever que les améliorations sur l’AIC sont marginales ! Les variables retenues sont souvent assez similaires. Nous remarquons que certaines variables peuvent êtres présents à la fois en groupe et en continue, nous conservons manuellement l’une d’entre elle seulement.
reg.invgaus.full <- glm(surcrete~. ,family=inverse.gaussian(link="1/mu^2"),data=na.omit(subset(df, select=-c(claim_amount))))
invgaus_stepwise <- step(reg.invgaus.full, direction = 'both')
#summary(reg.invgaus)
summary(invgaus_stepwise)
##
## Call:
## glm(formula = surcrete ~ drv_age2G + vh_ageG + vh_din + vh_dinG +
## vh_type + pol_bonus + pol_durationG + pol_coverageG, family = inverse.gaussian(link = "1/mu^2"),
## data = na.omit(subset(df, select = -c(claim_amount))))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.082577 -0.049352 -0.023877 0.004786 0.104212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.008e-06 4.765e-07 4.214 2.52e-05 ***
## drv_age2G(25,45] 2.135e-07 1.610e-07 1.326 0.18486
## drv_age2G(45,103] 5.207e-08 1.636e-07 0.318 0.75036
## vh_ageG(10,25] 2.871e-07 5.511e-08 5.210 1.92e-07 ***
## vh_ageG(25,30] -3.344e-07 1.316e-07 -2.541 0.01108 *
## vh_ageG(30,100] -1.345e-07 4.631e-07 -0.291 0.77142
## vh_din -2.018e-09 4.272e-10 -4.723 2.34e-06 ***
## vh_dinG(50,220] -1.134e-06 4.061e-07 -2.792 0.00525 **
## vh_dinG(220,507] -1.014e-06 4.464e-07 -2.271 0.02318 *
## vh_typeTourism 1.945e-07 6.172e-08 3.152 0.00163 **
## pol_bonus -4.294e-07 2.145e-07 -2.001 0.04538 *
## pol_durationG(10,20] -4.861e-08 4.785e-08 -1.016 0.30977
## pol_durationG(20,25] 3.783e-08 8.041e-08 0.470 0.63806
## pol_durationG(25,30] -1.806e-07 6.105e-08 -2.959 0.00310 **
## pol_durationG(30,39] 1.316e-07 2.559e-07 0.514 0.60711
## pol_coverageGMedian 9.378e-07 7.461e-08 12.570 < 2e-16 ***
## pol_coverageGMini 9.117e-07 1.523e-07 5.986 2.21e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 0.001741437)
##
## Null deviance: 22.871 on 13023 degrees of freedom
## Residual deviance: 22.218 on 13007 degrees of freedom
## AIC: 202327
##
## Number of Fisher Scoring iterations: 9
reg.logn.full <- lm(log(surcrete) ~. , data=na.omit(subset(df, select=-c(claim_amount))))
logn_stepwise <- step(reg.logn.full, direction = 'both')
summary(logn_stepwise)
##
## Call:
## lm(formula = log(surcrete) ~ drv_age1 + drv_age2G + vh_ageG +
## vh_age + vh_valueG + vh_value + vh_din + vh_dinG + vh_type +
## densityG + pol_bonus + pol_durationG + pol_coverageG, data = na.omit(subset(df,
## select = -c(claim_amount))))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1426 -0.8344 -0.1797 0.7235 2.8935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.852e+00 1.732e-01 33.781 < 2e-16 ***
## drv_age1 2.125e-03 9.575e-04 2.219 0.026478 *
## drv_age2G(25,45] -1.812e-01 8.251e-02 -2.196 0.028130 *
## drv_age2G(45,103] -1.258e-01 8.951e-02 -1.405 0.160050
## vh_ageG(10,25] -1.843e-01 3.512e-02 -5.247 1.57e-07 ***
## vh_ageG(25,30] 1.545e-01 1.573e-01 0.982 0.325989
## vh_ageG(30,100] -2.496e-01 2.189e-01 -1.140 0.254170
## vh_age 8.244e-03 3.066e-03 2.689 0.007183 **
## vh_valueG(8e+03,1.5e+04] -9.488e-02 1.148e-01 -0.826 0.408660
## vh_valueG(1.5e+04,2.2e+04] -8.695e-02 1.176e-01 -0.739 0.459858
## vh_valueG(2.2e+04,1.32e+05] -1.807e-01 1.242e-01 -1.455 0.145704
## vh_value 7.421e-06 2.790e-06 2.660 0.007825 **
## vh_din 1.423e-03 6.141e-04 2.317 0.020521 *
## vh_dinG(50,220] 3.964e-01 1.061e-01 3.737 0.000187 ***
## vh_dinG(220,507] 3.717e-01 1.560e-01 2.383 0.017190 *
## vh_typeTourism -1.213e-01 3.337e-02 -3.636 0.000278 ***
## densityG(40,200] 3.060e-02 4.183e-02 0.732 0.464435
## densityG(200,500] 8.379e-02 4.482e-02 1.869 0.061587 .
## densityG(500,4.5e+03] 1.541e-02 5.093e-02 0.303 0.762224
## densityG(4.5e+03,Inf] 8.233e-02 5.053e-02 1.629 0.103284
## pol_bonus 2.531e-01 1.015e-01 2.493 0.012672 *
## pol_durationG(10,20] 4.730e-02 2.181e-02 2.169 0.030120 *
## pol_durationG(20,25] -4.770e-02 3.459e-02 -1.379 0.167958
## pol_durationG(25,30] 9.616e-02 3.441e-02 2.795 0.005200 **
## pol_durationG(30,39] -1.717e-01 9.549e-02 -1.798 0.072242 .
## pol_coverageGMedian -4.441e-01 2.154e-02 -20.622 < 2e-16 ***
## pol_coverageGMini -4.307e-01 4.173e-02 -10.321 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.027 on 12997 degrees of freedom
## Multiple R-squared: 0.05742, Adjusted R-squared: 0.05553
## F-statistic: 30.45 on 26 and 12997 DF, p-value: < 2.2e-16
reg.logn <- lm(formula = log(surcrete) ~ drv_age1 + vh_ageG + vh_dinG +
vh_type + pol_bonus + pol_durationG + pol_coverageG, data =df)
summary(reg.logn)
##
## Call:
## lm(formula = log(surcrete) ~ drv_age1 + vh_ageG + vh_dinG + vh_type +
## pol_bonus + pol_durationG + pol_coverageG, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0184 -0.8427 -0.1696 0.7294 2.8528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.8292768 0.1195391 48.765 < 2e-16 ***
## drv_age1 0.0031412 0.0006584 4.771 1.85e-06 ***
## vh_ageG(10,25] -0.1328789 0.0209189 -6.352 2.19e-10 ***
## vh_ageG(25,30] 0.3169738 0.1430571 2.216 0.026728 *
## vh_ageG(30,100] -0.0209880 0.1942372 -0.108 0.913955
## vh_dinG(50,220] 0.4307782 0.0886671 4.858 1.20e-06 ***
## vh_dinG(220,507] 0.8418149 0.1210098 6.957 3.65e-12 ***
## vh_typeTourism -0.1140722 0.0322895 -3.533 0.000413 ***
## pol_bonus 0.3297129 0.0944027 3.493 0.000480 ***
## pol_durationG(10,20] 0.0466793 0.0218504 2.136 0.032672 *
## pol_durationG(20,25] -0.0506218 0.0346514 -1.461 0.144071
## pol_durationG(25,30] 0.0952796 0.0344595 2.765 0.005701 **
## pol_durationG(30,39] -0.1681721 0.0956966 -1.757 0.078882 .
## pol_coverageGMedian -0.4527098 0.0215025 -21.054 < 2e-16 ***
## pol_coverageGMini -0.4343161 0.0417423 -10.405 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.03 on 13009 degrees of freedom
## Multiple R-squared: 0.05128, Adjusted R-squared: 0.05026
## F-statistic: 50.23 on 14 and 13009 DF, p-value: < 2.2e-16
perf_model <- function(model){
return(list("AIC"= AIC(model), "BIC"= BIC(model), "log-vraisemblance" = logLik(model), "Deviance" =model$deviance, "Null dev." = model$null.deviance))
}
list("AIC"= AIC(reg.logn), "BIC"= BIC(reg.logn), "log-vraisemblance" = logLik(reg.logn))
## $AIC
## [1] 37740.71
##
## $BIC
## [1] 37860.3
##
## $`log-vraisemblance`
## 'log Lik.' -18854.35 (df=16)
reg.gamma <- glm(surcrete~ drv_age1 + vh_ageG + vh_dinG +
vh_type + pol_bonus + pol_durationG + pol_coverageG ,family=Gamma(link="log"),data=df)
perf_model(reg.gamma)
## $AIC
## [1] 206876.6
##
## $BIC
## [1] 206996.2
##
## $`log-vraisemblance`
## 'log Lik.' -103422.3 (df=16)
##
## $Deviance
## [1] 15068.94
##
## $`Null dev.`
## [1] 15664.3
reg.loggamma <- glm(log(surcrete)~ drv_age1 + vh_ageG + vh_dinG +
vh_type + pol_bonus + pol_durationG + pol_coverageG ,family=Gamma(link='identity'),data=df)
perf_model(reg.loggamma)
## $AIC
## [1] 37056.46
##
## $BIC
## [1] 37176.06
##
## $`log-vraisemblance`
## 'log Lik.' -18512.23 (df=16)
##
## $Deviance
## [1] 331.389
##
## $`Null dev.`
## [1] 349.8797
reg.invgaus <- glm(formula = surcrete ~ drv_age1 + vh_ageG + vh_dinG +
vh_type + pol_bonus + pol_durationG + pol_coverageG, family = inverse.gaussian(link = "log"), start=coefficients(reg.gamma), data = subset(df, select = -c(claim_amount)))
summary(reg.invgaus)
##
## Call:
## glm(formula = surcrete ~ drv_age1 + vh_ageG + vh_dinG + vh_type +
## pol_bonus + pol_durationG + pol_coverageG, family = inverse.gaussian(link = "log"),
## data = subset(df, select = -c(claim_amount)), start = coefficients(reg.gamma))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.081591 -0.049556 -0.023912 0.004855 0.105755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5249942 0.1405852 46.413 < 2e-16 ***
## drv_age1 0.0021746 0.0008516 2.553 0.010678 *
## vh_ageG(10,25] -0.1053311 0.0261517 -4.028 5.66e-05 ***
## vh_ageG(25,30] 0.3270079 0.2159604 1.514 0.129999
## vh_ageG(30,100] -0.1090480 0.2202828 -0.495 0.620583
## vh_dinG(50,220] 0.3274655 0.0948179 3.454 0.000555 ***
## vh_dinG(220,507] 0.6290825 0.1574529 3.995 6.49e-05 ***
## vh_typeTourism -0.0853282 0.0425474 -2.005 0.044932 *
## pol_bonus 0.3051741 0.1217959 2.506 0.012236 *
## pol_durationG(10,20] 0.0333220 0.0280969 1.186 0.235656
## pol_durationG(20,25] -0.0334847 0.0437277 -0.766 0.443837
## pol_durationG(25,30] 0.1315066 0.0468801 2.805 0.005036 **
## pol_durationG(30,39] -0.0304020 0.1189563 -0.256 0.798285
## pol_coverageGMedian -0.3948936 0.0254549 -15.513 < 2e-16 ***
## pol_coverageGMini -0.3931200 0.0476602 -8.248 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 0.001723602)
##
## Null deviance: 22.871 on 13023 degrees of freedom
## Residual deviance: 22.292 on 13009 degrees of freedom
## AIC: 202366
##
## Number of Fisher Scoring iterations: 5
perf_model(reg.invgaus)
## $AIC
## [1] 202366.2
##
## $BIC
## [1] 202485.8
##
## $`log-vraisemblance`
## 'log Lik.' -101167.1 (df=16)
##
## $Deviance
## [1] 22.29224
##
## $`Null dev.`
## [1] 22.87138
#df$prediction<-p
plotgroupresiduals <- function(object, m=100, trim=TRUE, main=NULL, ...)
{
yh <- fitted.values(object)
re <- residuals(object)
if(trim)
ind <- abs(re) <= quantile(abs(re), probs=.99) & abs(yh) <= quantile(abs(yh), probs=.99)
else
ind <- 1:length(re)
yh <- yh[ind]
re <- re[ind]
n <- length(yh)
ind <- sample(1:n, n)
yh <- yh[ind]
re <- re[ind]
#group
if(m > 1)
{
yhg <- rowMeans(matrix(yh, ncol=m))
reg <- rowMeans(matrix(re, ncol=m))
plot(yhg, reg, ylab = "Group residuals", xlab="Group fitted values", main=main, ...)
}else
plot(yh, re, ylab = "Residuals", xlab="Fitted values", main=main, ...)
abline(h=0, lty=3, col="grey")
}
par(mfrow=c(2,2))
plotgroupresiduals(reg.invgaus, m=1, main = "Inverse Gaussian")
plotgroupresiduals(reg.logn, m=1, main ="Lognormal")
plotgroupresiduals(reg.gamma, m=1, main = "Gamma")
plotgroupresiduals(reg.loggamma, m=1, main = 'Loggamma')# Pas besoin de paquet car ccompare des résidus à des prédictions continues
## Mean prediction by classes
df$prediction <- predict(reg.invgaus, type ='response')
df$drv_age2G <- cut(df$drv_age1, c(17, 25, 45,100))
fig1 <- plot_ly(data =df, x = ~drv_age2G, y = ~claim_amount, type = "box", name = 'Boxplot Observed values') %>% layout(xaxis = list(title = 'Driver age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
fig2 <- plot_ly(data =df, x = ~drv_age2G, y = ~prediction, type = "box", name = 'Boxplot Inverse gaussian') %>% layout(xaxis = list(title = 'Driver age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
fig <- subplot(fig1, fig2) %>% layout(title = "Boxplot des espérance prédites par classes d'âge")
## Warning: Ignoring 1 observations
## Warning: Ignoring 1 observations
fig
Par ailleurs, nous pouvons également envisager une méthode de séparation plutôt que la méthode d’écrêtement implémentée ci-dessus. Nous pourrions classer les profils de risque selon leur probabilité d’avoir des montants de sinistres élevés ou standard avec les variables explicatives à disposition. Nous montrons ci-dessous un exemple réalisé à partir de l’âge des conducteurs.
u <- 6000
df$standard <- (df$claim_amount<u) # 1 or TRUE if amount is less than 6000, our target
age <- seq(18,85)
regC <- glm(standard~bs(drv_age1),data=df,family=binomial) #https://towardsdatascience.com/from-logistic-regression-to-basis-expansions-and-splines-74d6bb3b8dc6
#to understand bsplines, often used in logreg when non obvious linear relation
ypC <- predict(regC,newdata=data.frame(drv_age1=age),type="response",
se=TRUE) #predict proba for every age
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(19L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
plot(age,ypC$fit,ylim=c(.95,1),type="l",)
polygon(c(age,rev(age)),c(ypC$fit+2*ypC$se.fit,rev(ypC$fit-2*ypC$se.fit)),
col="grey",border=NA)
abline(h=mean(df$standard),lty=2)
Probabilité d’avoir un sinistre standard, étant donné qu’un sinistre est survenu, en fonction de l’âge du conducteur. l’âge du conducteur. Régression logistique avec un lisseur spline.
Nous réalisons maintenant deux modèles pour chaque type de sinistres (standard ou supérieux à 6000€).
indexstandard <- which(df$claim_amount<u)
mean(df$claim_amount[indexstandard])
## [1] 802.0671
mean(df$claim_amount[-indexstandard])
## [1] 10895.21
regA <- glm(claim_amount~bs(drv_age1),data=df[indexstandard,], family=Gamma(link="log"))
summary(regA)
##
## Call:
## glm(formula = claim_amount ~ bs(drv_age1), family = Gamma(link = "log"),
## data = df[indexstandard, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6888 -1.4842 -0.7335 0.2496 3.1197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.80797 0.09699 70.190 <2e-16 ***
## bs(drv_age1)1 -0.55370 0.26677 -2.076 0.038 *
## bs(drv_age1)2 0.30443 0.18572 1.639 0.101
## bs(drv_age1)3 -0.09539 0.29746 -0.321 0.748
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 1.844181)
##
## Null deviance: 24011 on 12702 degrees of freedom
## Residual deviance: 23958 on 12699 degrees of freedom
## AIC: 193703
##
## Number of Fisher Scoring iterations: 7
ypA <- predict(regA,newdata=data.frame(drv_age1=age),type="response")
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(19L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
regB <- glm(claim_amount~bs(drv_age1),data=df[-indexstandard,],family=Gamma(link="log"))
summary(regB)
##
## Call:
## glm(formula = claim_amount ~ bs(drv_age1), family = Gamma(link = "log"),
## data = df[-indexstandard, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6351 -0.4139 -0.2509 0.0201 6.5509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.53233 0.65599 13.007 <2e-16 ***
## bs(drv_age1)1 1.96836 1.53321 1.284 0.200
## bs(drv_age1)2 -0.08371 0.77285 -0.108 0.914
## bs(drv_age1)3 0.82435 1.04543 0.789 0.431
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 2.132098)
##
## Null deviance: 105.40 on 320 degrees of freedom
## Residual deviance: 99.62 on 317 degrees of freedom
## AIC: 6424.6
##
## Number of Fisher Scoring iterations: 8
ypB <- predict(regB,newdata=data.frame(drv_age1=age),type="response")
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(20L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
reg <- glm(claim_amount~bs(drv_age1),data=df,family=Gamma(link="log"))
yp <- predict(reg,newdata=data.frame(drv_age1=age),type="response")
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(19L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
ypC <- predict(regC,newdata=data.frame(drv_age1=age),type="response")
## Warning in bs(drv_age1, degree = 3L, knots = numeric(0), Boundary.knots =
## c(19L, : some 'x' values beyond boundary knots may cause ill-conditioned bases
plot(age,yp,type="l",lwd=2,ylab="Average cost",xlab="Age of the driver")
lines(age,ypC*ypA+(1-ypC)*ypB,type="h",col="grey",lwd=6)
#expliquer la formule : prix moyen : proba de standard * prix standard + proba large montant * prix large montant
lines(age,ypC*ypA,type="h",col="black",lwd=6)
abline(h= mean(df$claim_amount),lty=2)
La ligne horizontale représente le coût moyen d’un sinistre. La ligne sombre à l’arrière, est la prédiction sur l’ensemble de données. La partie sombre est la partie de la partie du sinistre moyen liée aux sinistres standard (inférieurs à s) et la zone plus claire est la part du sinistre moyen due à d’éventuels sinistres importants (supérieurs à s).